02 - R framework with IMPACT - session 2

Author

Yann Say

Published

April 10, 2024

library(cleaningtools)
library(dplyr)

my_raw_dataset <- cleaningtools::cleaningtools_raw_data
my_kobo_survey <- cleaningtools::cleaningtools_survey
my_kobo_choice <- cleaningtools::cleaningtools_choices

Recap

more_logs <- my_raw_dataset %>% 
  check_duplicate(uuid_column = "X_uuid") %>% 
  check_soft_duplicates(uuid_column = "X_uuid", kobo_survey = my_kobo_survey, sm_separator = ".") %>%
  check_outliers(uuid_column = "X_uuid") %>%
  check_value(uuid_column = "X_uuid") 
more_logs$checked_dataset <- more_logs$checked_dataset %>% 
  add_duration(uuid_column = "X_uuid", start_column = "X.U.FEFF.start", end_column = "end")
more_logs <- more_logs %>% 
  check_duration(column_to_check = "duration", uuid_column = "X_uuid")
other_columns_to_check <- my_kobo_survey %>% 
  filter(type == "text") %>% 
  filter(name %in% names(my_raw_dataset)) %>%
  pull(name) 

more_logs <- more_logs %>% 
  check_others(uuid_column = "X_uuid", columns_to_check = other_columns_to_check) 
logical_check_list <- readxl::read_excel("inputs/01 - example - check_list.xlsx")
more_logs <- more_logs %>% 
  check_logical_with_list(uuid_column = "X_uuid",
                          list_of_check = logical_check_list,
                          check_id_column = "check_id",
                          check_to_perform_column = "check_to_perform",
                          columns_to_clean_column = "columns_to_clean",
                          description_column = "description")

Creating a cleaning log

create_*

create_* functions will create, transform something, e.g. creating a cleaning log with the checks to be filled, create analysis results table, create an output.

Outputs from create_* functions outputs can be in different shape, format, etc.

create_* function is catch-all.

create_combined_log

create_combined_log will combined all the logs from a list into one. It will also add 2 columns that will be used for the cleaning.

names(more_logs)
[1] "checked_dataset"    "duplicate_log"      "soft_duplicate_log"
[4] "potential_outliers" "flaged_value"       "duration_log"      
[7] "other_log"          "logical_all"       
my_combined_log <- create_combined_log(more_logs)
List of element to combine- checked_dataset, duplicate_log, soft_duplicate_log, potential_outliers, flaged_value, duration_log, other_log, logical_all
typeof(my_combined_log)
[1] "list"
names(my_combined_log)
[1] "checked_dataset" "cleaning_log"   
my_combined_log$cleaning_log %>% 
  head()
uuid old_value question issue check_id check_binding change_type new_value
b5b1d37a-e27a-4c35-a0f5-2cde9d6dfd06 86 age_respondent_r outlier (normal distribution) NA age_respondent_r / b5b1d37a-e27a-4c35-a0f5-2cde9d6dfd06 NA NA
956b5ed0-5a62-41b7-aec3-af93fbc5b494 84 age_respondent_r outlier (normal distribution) NA age_respondent_r / 956b5ed0-5a62-41b7-aec3-af93fbc5b494 NA NA
97ad6294-30c6-454e-a0b3-42126415b767 18 age_respondent_r outlier (log distribution) NA age_respondent_r / 97ad6294-30c6-454e-a0b3-42126415b767 NA NA
e005e719-57c4-44a3-ac2f-5d6d1ff68831 18 age_respondent_r outlier (log distribution) NA age_respondent_r / e005e719-57c4-44a3-ac2f-5d6d1ff68831 NA NA
c9aaa542-118f-4e42-93de-fb0916572541 19 num_hh_member outlier (normal distribution) NA num_hh_member / c9aaa542-118f-4e42-93de-fb0916572541 NA NA
48e8896b-d1be-4600-8839-2d8b994ebcfb 19 num_hh_member outlier (normal distribution) NA num_hh_member / 48e8896b-d1be-4600-8839-2d8b994ebcfb NA NA

The cleaning log contains all the columns from all the logs from more_logs with in addition:

  • check_binding is filled for all rows.
  • change_type (empty)
  • new_value (empty)

add_info_to_cleaning_log

If more information from the dataset should be added, the function add_info_to_cleaning_log can help.

add_*

add_* functions will add a variable (column) to the dataset. For example, to add the duration of a survey, to add the food consumption score category, etc.

add_* function takes a dataset as input and returns the dataset + the new indicator (and any intermediate steps used for the calculation).

For example, to check the duration of a survey, there is only the start and end, but not the duration column.

my_combined_log <- my_combined_log %>% 
  add_info_to_cleaning_log(dataset_uuid_column = "X_uuid", 
                           information_to_add = "enumerator_num")

my_combined_log$cleaning_log %>% 
  head()
uuid old_value question issue check_id check_binding change_type new_value enumerator_num
019bc718-c06a-46b8-bba8-c84f6c6efbd5 لا اعلم water_supply_other_neighbourhoods_why recode other NA water_supply_other_neighbourhoods_why / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 NA NA 12
019bc718-c06a-46b8-bba8-c84f6c6efbd5 لا ارى جدوى من ذلك prefer_not_engage_other recode other NA prefer_not_engage_other / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 NA NA 12
019bc718-c06a-46b8-bba8-c84f6c6efbd5 السلطات ليست مهتمة بالخدمات trust_water_office_why_not recode other NA trust_water_office_why_not / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 NA NA 12
019bc718-c06a-46b8-bba8-c84f6c6efbd5 247.20 duration Duration is lower or higher than the thresholds NA duration / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 NA NA 12
03183d24-0275-43fe-8976-d076f29de590 عدم توفير خدمه المياه في المنطقه وضعف في تزويد خدمه وعدد ساعات اقل تجهيز للبيوت water_supply_other_neighbourhoods_why recode other NA water_supply_other_neighbourhoods_why / 03183d24-0275-43fe-8976-d076f29de590 NA NA 2
03183d24-0275-43fe-8976-d076f29de590 10 pay_water_charges_amount outlier (log distribution) NA pay_water_charges_amount / 03183d24-0275-43fe-8976-d076f29de590 NA NA 2

create_xlsx_cleaning_log

create_xlsx_cleaning_log(my_combined_log,
                         sm_dropdown_type = "logical",
                         output_path =  "outputs/01 - example - cleaning-log-no-kobo.xlsx")

create_xlsx_cleaning_log will write an excel file with:

  • checked_dataset tab: the checked dataset, with additional columns if any.
  • cleaning_log tab: the combined log with the change_type column with a data validation rules.
  • readme tab: change_type values definition.

There are 4 actions possible:

  • change_response: Change the response to new.value
  • blank_response: Remove and NA the response
  • remove_survey: Delete the survey
  • no_action: No action to take.

This log will have to be filled in with actions to take and new value if needed.

create_xlsx_cleaning_log(my_combined_log,
                         kobo_survey = my_kobo_survey,
                         kobo_choices = my_kobo_choice,
                         use_dropdown = T,
                         sm_dropdown_type = "logical",
                         output_path =  "outputs/02 - example - cleaning-log-with-kobo.xlsx")

If the KOBO information are provided and the use_dropdown argument is set to TRUE, new_value will have a data validation rule based on the KOBO options.

Note

Select multiple dummy columns (TRUE/FALSE or 1/0) are flagged and used later for the cleaning, not the parent column.

Creating a clean dataset

Important

The cleaning has to be filled before moving forward. The above steps are to create the cleaning log, not to fill it or clean the dataset. Filling the cleaning log is not an automatic step for this process. The decision to change a value has to be recorded.

review_*

review_* functions will review an object by comparing it to standards or another object and flags differences, e.g. reviewing the cleaning by comparing the raw dataset, the clean dataset and the cleaning log, analysis comparing it with another analysis.

  • if the cleaning has been filled correctly
  • the cleaning has been done correctly
  • comparing indicators
  • comparing analysis
  • etc.

review_cleaning_log

review_cleaning_log will review the filled cleaning log.

my_filled_log <- readxl::read_excel("inputs/02 - example - cleaning-log-with-kobo - filled.xlsx", sheet = 2)

check_log_results <- review_cleaning_log(raw_dataset = my_raw_dataset,
                                        raw_data_uuid_column = "X_uuid",
                                        cleaning_log = my_filled_log, 
                                        cleaning_log_uuid_column = "uuid",
                                        cleaning_log_question_column = "question",
                                        cleaning_log_new_value_column = "new_value",
                                        cleaning_log_change_type_column = "change_type",
                                        change_response_value = "change_response")
check_log_results
[1] "no issues in cleaning log found"

create_clean_data

To create the clean dataset, create_clean_data will use the raw dataset and the filled cleaning log.

my_clean_data <- create_clean_data(raw_dataset = my_raw_dataset,
                                   raw_data_uuid_column = "X_uuid",
                                   cleaning_log = my_filled_log, 
                                   cleaning_log_uuid_column = "uuid",
                                   cleaning_log_question_column = "question",
                                   cleaning_log_new_value_column = "new_value",
                                   cleaning_log_change_type_column = "change_type")
[1] "water_supply_other_neighbourhoods_why"
[1] "trust_water_office_why_not"
[1] "pay_water_charges_amount"
[1] "connection_fees_amount"
[1] "connection_fees_amount"
[1] "primary_livelihood.employment"
[1] "primary_livelihood.employment"
[1] "primary_livelihood.employment"
[1] "tank_emptied"
[1] "access_water_enough"

recreate_parent_column

In the cleaning log, some select multiple are changed, but only the dummy. The parent column should be recoded also.

my_filled_log %>% 
  filter(question == "primary_livelihood.employment", 
         change_type == "change_response") %>% 
  select(uuid, question, old_value, new_value)
uuid question old_value new_value
eb3b9935-fa0b-4d54-8058-3b629f1421ad primary_livelihood.employment TRUE FALSE
eec1f630-15d5-475e-a344-32bba74b32ea primary_livelihood.employment TRUE FALSE
f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb primary_livelihood.employment TRUE FALSE
my_clean_data %>% 
  filter(X_uuid %in% c("eb3b9935-fa0b-4d54-8058-3b629f1421ad", "eec1f630-15d5-475e-a344-32bba74b32ea", "f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb")) %>% 
  select(X_uuid,primary_livelihood, primary_livelihood.employment)
X_uuid primary_livelihood primary_livelihood.employment
f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb employment FALSE
eb3b9935-fa0b-4d54-8058-3b629f1421ad employment ngo FALSE
eec1f630-15d5-475e-a344-32bba74b32ea employment FALSE
my_clean_data2 <- recreate_parent_column(dataset = my_clean_data,
                                         uuid_column = "X_uuid",
                                         kobo_survey = my_kobo_survey,
                                         kobo_choices = my_kobo_choice,
                                         sm_separator = ".", 
                                         cleaning_log_to_append = my_filled_log)
my_clean_data2$data_with_fix_concat %>% 
  filter(X_uuid %in% c("eb3b9935-fa0b-4d54-8058-3b629f1421ad", "eec1f630-15d5-475e-a344-32bba74b32ea", "f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb")) %>% 
  select(X_uuid,primary_livelihood, primary_livelihood.employment)
X_uuid primary_livelihood primary_livelihood.employment
f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb NA FALSE
eb3b9935-fa0b-4d54-8058-3b629f1421ad ngo FALSE
eec1f630-15d5-475e-a344-32bba74b32ea NA FALSE
my_clean_data2$cleaning_log %>% 
  filter(question == "primary_livelihood", 
         uuid %in% c("eb3b9935-fa0b-4d54-8058-3b629f1421ad", "eec1f630-15d5-475e-a344-32bba74b32ea", "f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb")) %>% 
  select(uuid, question, old_value, new_value)
uuid question old_value new_value
eb3b9935-fa0b-4d54-8058-3b629f1421ad primary_livelihood employment ngo ngo
eec1f630-15d5-475e-a344-32bba74b32ea primary_livelihood employment NA
f1175d29-ce1f-43a7-b3d1-ee26cd1b8cdb primary_livelihood employment NA

review_others

In the cleaning log, some opentext values are changed to blank. Some open text questions are linked some skip logic, i.e. what is X? Other, please specify. In some cases, values some values should be changed.

In the example below, the value for water_supply_other_neighbourhoods_why for the uuid 019bc718-c06a-46b8-bba8-c84f6c6efbd5 was changed to NA.

my_filled_log %>% 
  filter(question == "water_supply_other_neighbourhoods_why", 
         change_type == "blank_response")
uuid old_value question issue check_id check_binding change_type new_value enumerator_num
019bc718-c06a-46b8-bba8-c84f6c6efbd5 لا اعلم water_supply_other_neighbourhoods_why recode other NA water_supply_other_neighbourhoods_why / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 blank_response NA 12

The kobo show a skip logic based on water_supply_other_neighbourhoods.

my_kobo_survey %>% 
  filter(name == "water_supply_other_neighbourhoods_why") %>% 
  select(type, name, relevant)
type name relevant
text water_supply_other_neighbourhoods_why selected(\({water_supply_other_neighbourhoods},'somewhat_worse') or selected(\){water_supply_other_neighbourhoods},‘much_worse’)
my_clean_data %>% 
  filter(X_uuid == "019bc718-c06a-46b8-bba8-c84f6c6efbd5") %>% 
  select(water_supply_other_neighbourhoods, water_supply_other_neighbourhoods_why   )
water_supply_other_neighbourhoods water_supply_other_neighbourhoods_why
somewhat_worse NA

Should the value of water_supply_other_neighbourhoods be changed? It depends on the question and skip logic but it important to flag those so a decision can be taken.

review_other_log <- review_others(dataset = my_clean_data2$data_with_fix_concat,
                                  uuid_column = "X_uuid", 
                                  kobo_survey = my_kobo_survey, 
                                  columns_not_to_check = "consent_telephone_number")
Warning in create_logic_for_other(kobo_survey = kobo_survey,
compare_with_dataset = TRUE, : The following parent names: well_quality,
spring_quality, rainwater_quality, surface_quality, why_not_connected were not
found in the dataset. The function is ignoring them.

review_cleaning

my_deletion_log <- my_clean_data2$cleaning_log %>% 
  filter(change_type == "remove_survey")

my_filled_log_no_deletion <- my_clean_data2$cleaning_log %>% 
  filter(change_type != "remove_survey") %>% 
  filter(!uuid %in% my_deletion_log$uuid)

review_of_cleaning <- review_cleaning(raw_dataset = my_raw_dataset,
                    raw_dataset_uuid_column = "X_uuid", 
                    clean_dataset = my_clean_data2$data_with_fix_concat,
                    clean_dataset_uuid_column = "X_uuid",
                    cleaning_log = my_filled_log_no_deletion, 
                    cleaning_log_uuid_column = "uuid",
                    cleaning_log_question_column = "question",
                    cleaning_log_new_value_column = "new_value",
                    cleaning_log_change_type_column = "change_type", 
                    cleaning_log_old_value_column = "old_value", 
                    deletion_log = my_deletion_log, 
                    deletion_log_uuid_column = "uuid"
                    )

Exercises

  • Export the cleaning log you have created previously.
previous_exercise_log <- readRDS("inputs/03 - exercise - previous_log.RDS")

Did you try the function create_combined_log

Did you try the function create_xlsx_cleaning_log

previous_exercise_log %>% 
  create_combined_log() %>%
  create_xlsx_cleaning_log(output_path = "outputs/03 - correction - cleaning_log.xlsx", 
                           kobo_survey = my_kobo_survey,
                           kobo_choices = my_kobo_choice,
                           sm_dropdown_type = "logical",
                           use_dropdown = TRUE)
  • (Optional) Fill-in the log.

  • Create the clean data from the raw dataset and the filled cleaning.

exercise_filled_log <- readxl::read_excel("inputs/04 - exercise - cleaning_log - filled.xlsx", sheet = "cleaning_log")

Did you try the function create_clean_data

Did you try the function recreate_parent_column

exercise_clean_dataset <- create_clean_data(raw_dataset = my_raw_dataset,
                                            raw_data_uuid_column = "X_uuid",
                                            cleaning_log = exercise_filled_log, 
                                            cleaning_log_uuid_column = "uuid",
                                            cleaning_log_question_column = "question",
                                            cleaning_log_new_value_column = "new_value",
                                            cleaning_log_change_type_column = "change_type")


exercise_clean_dataset2 <- recreate_parent_column(exercise_clean_dataset,
                                                  uuid_column = "X_uuid", 
                                                  kobo_survey = my_kobo_survey,
                                                  kobo_choices = my_kobo_choice,
                                                  cleaning_log_to_append = exercise_filled_log)
  • Review the cleaning below, if there is someone else doing the exercise, you can try to review someone’s cleaning.
exercise3_clean_dataset <- readxl::read_excel("inputs/05 - exercise - clean dataset for review.xlsx")

exercise3_cleaning_log <- readxl::read_excel("inputs/05 - exercise - clean dataset for review.xlsx", sheet = 2)

Did you try the function review_cleaning

Did you separate the cleaning log?

exercise3_deletion_log <- exercise3_cleaning_log %>% 
  filter(change_type == "remove_survey")

exercise3_log_no_deletion <- exercise3_cleaning_log %>% 
  filter(change_type != "remove_survey") %>% 
  filter(!uuid %in% exercise3_deletion_log$uuid)

review_of_cleaning <- review_cleaning(raw_dataset = my_raw_dataset,
                                      raw_dataset_uuid_column = "X_uuid", 
                                      clean_dataset = exercise3_clean_dataset,
                                      clean_dataset_uuid_column = "X_uuid",
                                      cleaning_log = exercise3_log_no_deletion, 
                                      cleaning_log_uuid_column = "uuid",
                                      cleaning_log_question_column = "question",
                                      cleaning_log_new_value_column = "new_value",
                                      cleaning_log_change_type_column = "change_type", 
                                      cleaning_log_old_value_column = "old_value", 
                                      deletion_log = exercise3_deletion_log, 
                                      deletion_log_uuid_column = "uuid"
)